# Date- 5/3/23
# Script No. 1
# Name: DATA 
# File target: 1) Importing and pulling the relevant variables from PPDB
#                 and creating the IDV of activists' relative power from PPDB
#              2) Importing and pulling the relevant variables from CSES, merging them with the PPDB data frame,
#                 and creating the DV representation gap from IMD
#              3) Creating the DV party extremity and polarization from IMD

library(tidyverse)
library(readr)
library(tidyr)
library(dplyr)
library(ggplot2)
library(psych)
library(data.table)
library(mltools)
library(ggpubr)
library(kableExtra)
library(frequency)

# NOTE(review): every relative file name read below is resolved against this
# machine-specific working directory.
setwd("C:/Users/yaira/Desktop/ideological polarization/FINAL/Do and data")
#####################################################################################################
################################### (1) Importing the PPDB data and #################################
###################################     creating the activists IDV  #################################
#####################################################################################################

#############################
### loading PPDB round 1a ###
#############################
ppdb1a <- read_csv("PPDB_Round1a_v3.csv")

# Removing countries/years that do not match CSES.
# A filter with a YEAR condition keeps, for that country, only the rows of the
# stated year; a filter without a YEAR condition removes the country entirely.
# Rows of countries not mentioned here are left untouched.
# AUS
ppdb1a <- ppdb1a[!(ppdb1a$COUNTRY == "Australia" & ppdb1a$YEAR != 2013),]
# AUT
ppdb1a <- ppdb1a[!(ppdb1a$COUNTRY == "Austria" & ppdb1a$YEAR != 2013),]
# BEL (dropped entirely)
ppdb1a <- ppdb1a[!(ppdb1a$COUNTRY == "Belgium"),]
# CAN
ppdb1a <- ppdb1a[!(ppdb1a$COUNTRY == "Canada" & ppdb1a$YEAR != 2015),]
# CAN (same rule for the upper-case spelling "CANADA")
ppdb1a <- ppdb1a[!(ppdb1a$COUNTRY == "CANADA" & ppdb1a$YEAR != 2015),]
# CZE (dropped entirely)
ppdb1a <- ppdb1a[!(ppdb1a$COUNTRY == "Czech Republic"),]
# DNK (dropped entirely)
ppdb1a <- ppdb1a[!(ppdb1a$COUNTRY == "Denmark"),]
# FRA
ppdb1a <- ppdb1a[!(ppdb1a$COUNTRY == "France" & ppdb1a$YEAR != 2012),]
# DEU
ppdb1a <- ppdb1a[!(ppdb1a$COUNTRY == "Germany" & ppdb1a$YEAR != 2013),]
# HUN
ppdb1a <- ppdb1a[!(ppdb1a$COUNTRY == "Hungary" & ppdb1a$YEAR != 2014),]
# IRL
ppdb1a <- ppdb1a[!(ppdb1a$COUNTRY == "Ireland" & ppdb1a$YEAR != 2011),]
# ISR
ppdb1a <- ppdb1a[!(ppdb1a$COUNTRY == "Israel" & ppdb1a$YEAR != 2013),]
# ITA
ppdb1a <- ppdb1a[!(ppdb1a$COUNTRY == "Italy" & ppdb1a$YEAR != 2014),]
# NLD
ppdb1a <- ppdb1a[!(ppdb1a$COUNTRY == "Netherlands" & ppdb1a$YEAR != 2012),]
# NOR
ppdb1a <- ppdb1a[!(ppdb1a$COUNTRY == "Norway" & ppdb1a$YEAR != 2013),]
# POL
ppdb1a <- ppdb1a[!(ppdb1a$COUNTRY == "Poland" & ppdb1a$YEAR != 2011),]
# PRT
ppdb1a <- ppdb1a[!(ppdb1a$COUNTRY == "Portugal" & ppdb1a$YEAR != 2011),]
# ESP (dropped entirely)
ppdb1a <- ppdb1a[!(ppdb1a$COUNTRY == "Spain"),]
# SWE
ppdb1a <- ppdb1a[!(ppdb1a$COUNTRY == "Sweden" & ppdb1a$YEAR != 2014),]
# GBR
ppdb1a <- ppdb1a[!(ppdb1a$COUNTRY == "United Kingdom" & ppdb1a$YEAR != 2010),]

# Pulling only the relevant columns, selected by position.
# NOTE(review): the positions are tied to the exact column layout of
# PPDB_Round1a_v3.csv; column 341 is deliberately placed last.
ppdb1a <- ppdb1a[,c(2:4, 7, 11, 204:209, 339, 342:354, 341)]
# Printed so the selected column names can be checked by eye.
colnames(ppdb1a)

#############################
### loading PPDB round 1b ###
#############################
ppdb1b <- read_csv("PPDB_Round_1b_v1_csv.csv")

# Removing countries that do not match CSES (both are dropped entirely).

# JPN
ppdb1b <- ppdb1b[!(ppdb1b$COUNTRY == "Japan"),]
# KOR
ppdb1b <- ppdb1b[!(ppdb1b$COUNTRY == "South Korea"),]

# Pulling only the relevant columns, selected by position.
# NOTE(review): this selection names 25 columns, whereas the round 1a selection
# names 26 -- confirm the two waves are meant to end up with the same columns.
ppdb1b <- ppdb1b[,c(2:4, 7, 11, 206:211, 332:345)]
colnames(ppdb1b)

# Merging both waves.
# Element-wise check that the column names of the two waves line up
# (the result is only printed, it is not enforced).
colnames(ppdb1a) == colnames(ppdb1b)

# No `by` is given, so the join uses every column name the two waves share.
ppdb <- full_join(ppdb1a, ppdb1b)
# Harmonise the upper-case spelling of Canada.
ppdb$COUNTRY[(ppdb$COUNTRY == "CANADA")]  <- "Canada"
colnames(ppdb)

# Ordering party families from left to right.
# pfam starts as a copy of PARTYFAM1; its distribution is inspected before
# any recoding takes place.
ppdb$pfam <- ppdb$PARTYFAM1
freq(ppdb$pfam)
hist(ppdb$PARTYFAM1)

# Recode PARTYFAM1 onto the left (0) to right (7) ordering in a single lookup.
# The two vectors are parallel: original code -> position on the ordering.
#                      Left so, CD, SD, RR, RR, CON, GREEN, LIB, Ag
family_codes     <- c(5,       9,  2,  6,  7,  1,   4,     3,   8)
ordered_position <- c(0,       5,  2,  7,  7,  6,   1,     4,   3)

code_index <- match(ppdb$PARTYFAM1, family_codes)
has_mapping <- !is.na(code_index)
# Values without a mapping (including NA) keep their original code.
ppdb$pfam[has_mapping] <- ordered_position[code_index[has_mapping]]
# Any remaining negative code is placed at the left end (0).
ppdb$pfam[which(ppdb$pfam < 0)] <- 0

view(select(ppdb, COUNTRY, PNAME, pfam, PARTYFAM1))

#### Creating the activists' relative power IDV ###
# 1) with weights
# Share of the `selected` score in the sum of all five scores
# (voters, members, selected, appointed, leader).
ppdb$activists <- with(
  ppdb,
  selected / (voters + members + selected + appointed + leader)
)
view(select(ppdb, COUNTRY, PTYNAME, PNAME, activists))
# Parties for which the share could not be computed are removed.
ppdb <- drop_na(ppdb, activists)

# 2) without weights

# Collapse a score to three levels:
#   above 4             -> 2
#   above 0 and up to 4 -> 1
#   0 or below          -> 0
# Vectorised, so it handles empty input and passes NA through as NA
# instead of aborting the way a scalar `if` inside a row loop does.
#
# score: numeric vector.
# Returns a numeric vector of the same length with values 0, 1, 2 (or NA).
recode_influence <- function(score) {
  ifelse(score > 4, 2, ifelse(score > 0, 1, 0))
}

# voters
ppdb$voters2 <- recode_influence(ppdb$voters)
ppdb$voters
ppdb$voters2

# members
ppdb$members2 <- recode_influence(ppdb$members)
ppdb$members
ppdb$members2

# selected
ppdb$selected2 <- recode_influence(ppdb$selected)
ppdb$selected
ppdb$selected2

# appointed
ppdb$appointed2 <- recode_influence(ppdb$appointed)
ppdb$appointed
ppdb$appointed2

# leader
ppdb$leader2 <- recode_influence(ppdb$leader)
ppdb$leader
ppdb$leader2

# activists (unweighted): share of the recoded `selected2` level in the sum
# of the five recoded levels. The computation is purely row-wise, so no
# grouping is applied and `ppdb` is not left as a grouped tibble.
ppdb <-
  ppdb %>%
  mutate(activists2 = selected2 /
           (voters2 + members2 + selected2 + appointed2 + leader2))

# Compare the weighted and unweighted versions of the measure.
scatter.smooth(ppdb$activists, ppdb$activists2)
corr.test(ppdb$activists, ppdb$activists2)

# saving as csv
write.csv(ppdb, "C:\\Users\\yaira\\Desktop\\ideological polarization\\FINAL\\Do and Data\\ppdb.csv")

#####################################################################################################
################################### (2) Importing the CSES data and #################################
###################################     creating representation DV  #################################
#####################################################################################################

# Loading data from CSES IMD
cses.imd <- read_csv("cses_imd.csv")

# Capturing only module 4 from the IMD: the data are split on IMD1008_MOD_4
# and the second group is kept.
# NOTE(review): presumably the second group holds the rows flagged as
# belonging to module 4 -- confirm against the codebook.
cses.imd.4 <- split.data.frame(cses.imd, cses.imd$IMD1008_MOD_4)
cses.4 <- cses.imd.4[[2]]

## Creating a new variable holding, for every voter, the code of the party
## they voted for. The first vote variable with a code below 9000000 is used,
## in the priority order PR_1, LH_PL, LH_DC; if none qualifies the value is NA.
## (Codes of 9000000 and above are presumably missing/other categories --
## confirm against the codebook.)
## Vectorised: an NA in a vote variable simply falls through to the next one
## instead of aborting the script.
cses.4$voted.for <- dplyr::case_when(
  cses.4$IMD3002_PR_1 < 9000000 ~ cses.4$IMD3002_PR_1,
  cses.4$IMD3002_LH_PL < 9000000 ~ cses.4$IMD3002_LH_PL,
  cses.4$IMD3002_LH_DC < 9000000 ~ cses.4$IMD3002_LH_DC
)

summary(cses.4$voted.for)
describe(cses.4$voted.for)

# Dropping respondents without a usable vote choice
cses.4.noNA <- cses.4 %>% drop_na(voted.for)

## Creating a variable for parties' left-right placement by those who voted for them
cses.4.noNA$voted.for.score <- NA

# Amendment: in Germany the vote variable (IMD3002_LH_PL) refers to the CDU as
# 2760001, while the party-placement slot A (IMD3007_A) is listed as 2760002
# (see IMD5000_A). Slot A is therefore re-labelled so that the two match.
cses.4.noNA$IMD5000_A[which(cses.4.noNA$IMD1006_NAM=="Germany")] <- 2760001

# For every respondent, find the party slot (A to I) whose code in IMD5000_<slot>
# equals the party voted for and copy that slot's placement, IMD3007_<slot>.
# Slots are processed from I down to A so that, when several slots match,
# the earliest letter wins (same priority as an if / else-if chain from A to I).
# `which()` skips NA comparisons, so a missing party code cannot abort the run.
for (party_slot in rev(LETTERS[1:9])) {
  slot_code <- cses.4.noNA[[paste0("IMD5000_", party_slot)]]
  slot_placement <- cses.4.noNA[[paste0("IMD3007_", party_slot)]]
  voted_for_slot <- which(slot_code == cses.4.noNA$voted.for)
  cses.4.noNA$voted.for.score[voted_for_slot] <- slot_placement[voted_for_slot]
}

# test
select(cses.4.noNA, voted.for, IMD5000_A, IMD3007_A, voted.for.score)

summary(cses.4.noNA$voted.for.score)
describe(cses.4.noNA$voted.for.score)

# Placements above 10 are set to NA, then respondents without a placement
# for their own party are dropped.
cses.4.noNA$voted.for.score[cses.4.noNA$voted.for.score > 10] <- NA
cses.4.noNA <- cses.4.noNA %>% drop_na(voted.for.score)

## Measure the gap between voters' self-placement and their perceived
## placement of the party they voted for.
# Containers filled per party. voter.4 / party.4 / Vcdf.4 / Pcdf.4 are indexed
# by the numeric PCSESID itself; rep.gap.4 / pol.gap.4 are keyed by its name.
voter.4 <- list()
Vcdf.4 <- list()
party.4 <- list()
Pcdf.4 <- list()
rep.gap.4 <- list()
pol.gap.4 <- list()

# Self-placements above 10 are treated as missing and those respondents dropped.
cses.4.noNA$IMD3006[cses.4.noNA$IMD3006 > 10] <- NA
cses.4.noNA <- drop_na(cses.4.noNA, IMD3006)

# Loading Political Party Database
ppdb <- read_csv("ppdb.csv")
head(ppdb)

for (i in unique(ppdb$PCSESID)) {
  party_voters <- which(cses.4.noNA$voted.for == i)

  # CDF of the voters' self-placement on the 0-10 scale
  v4 <- as.matrix(na.omit(cses.4.noNA$IMD3006[party_voters]))
  voter.4[[i]] <- v4
  Vcdf.4[[i]] <- empirical_cdf(v4, ubounds = seq(0, 10, by = 1.0))

  # CDF of the voters' perceived party placement on the 0-10 scale
  p4 <- as.matrix(na.omit(cses.4.noNA$voted.for.score[party_voters]))
  party.4[[i]] <- p4
  Pcdf.4[[i]] <- empirical_cdf(p4, ubounds = seq(0, 10, by = 1.0))

  # Gap between the two CDFs, taken from the third column of each
  # empirical_cdf() result: summed absolute difference (representation gap)
  # and summed signed difference (polarization gap).
  party_cdf <- Pcdf.4[[i]][[3]]
  voter_cdf <- Vcdf.4[[i]][[3]]
  rep.gap.4[[as.character(i)]] <- sum(abs(party_cdf - voter_cdf))
  pol.gap.4[[as.character(i)]] <- sum(party_cdf - voter_cdf)
}

## Adding the representation gap and the polarization gap to the PPDB data
# Each list of per-party scores is flattened into a (PCSESID, score) table
# and merged onto the party data by PCSESID.

# rep
rep_scores_4 <- unlist(rep.gap.4)
Rep.Gap.4 <- data.frame(
  PCSESID = names(rep_scores_4),
  party.gap = rep_scores_4,
  stringsAsFactors = FALSE
)
ppdb.noNA <- merge(ppdb, Rep.Gap.4, by = "PCSESID")

# pol
pol_scores_4 <- unlist(pol.gap.4)
Pol.Gap.4 <- data.frame(
  PCSESID = names(pol_scores_4),
  party.pol = pol_scores_4,
  stringsAsFactors = FALSE
)
ppdb.noNA <- merge(ppdb.noNA, Pol.Gap.4, by = "PCSESID")

View(select(ppdb.noNA, COUNTRY, PTYNAME, party.gap, party.pol))

##### adding Chile, Hungary and Italy from cses.5 #####

cses.5 <- read_csv("cses5.csv")

## Creating a new variable holding, for every voter, the code of the party
## they voted for: E3013_LH_PL when it is below 900000, otherwise E3013_LH_DC
## when that is below 900000, otherwise NA.
## (Codes of 900000 and above are presumably missing/other categories --
## confirm against the codebook.)
## Vectorised: an NA in a vote variable falls through to the next one
## instead of aborting the script.
cses.5$voted.for <- dplyr::case_when(
  cses.5$E3013_LH_PL < 900000 ~ cses.5$E3013_LH_PL,
  cses.5$E3013_LH_DC < 900000 ~ cses.5$E3013_LH_DC
)

summary(cses.5$voted.for)
describe(cses.5$voted.for)

# Dropping respondents without a usable vote choice
cses.5.noNA <- cses.5 %>% drop_na(voted.for)

# Perceived left-right placement of the party each respondent voted for.
# The two vectors below are parallel: party code -> placement slot, i.e. the
# E3019_<slot> column that holds the placement of that party.
# NOTE(review): there is no entry for 380001 -- confirm this is intentional.
mapped_party_codes <- c(152001, 348001,
                        152002, 348002, 380002,
                        152003, 348003, 380003,
                        152004, 348004, 380004,
                        152005,
                        380008)
mapped_party_slots <- c("A", "A",
                        "B", "B", "B",
                        "C", "C", "C",
                        "D", "D", "D",
                        "E",
                        "H")

cses.5.noNA$voted.for.score <- NA

# Slot of each respondent's party (NA when the party is not in the table,
# in which case the score stays NA).
respondent_slot <- mapped_party_slots[match(cses.5.noNA$voted.for,
                                            mapped_party_codes)]
for (slot in unique(na.omit(respondent_slot))) {
  slot_rows <- which(respondent_slot == slot)
  cses.5.noNA$voted.for.score[slot_rows] <-
    cses.5.noNA[[paste0("E3019_", slot)]][slot_rows]
}

# test
(cses.5.noNA[cses.5.noNA[, "voted.for"] == 348002,c("voted.for","E3019_B","voted.for.score")])

summary(cses.5.noNA$voted.for.score)
describe(cses.5.noNA$voted.for.score)

# Placements above 10 are treated as missing.
cses.5.noNA$voted.for.score[cses.5.noNA$voted.for.score > 10] <- NA

## Measure the gap between voters' self-placement and their perceived
## placement of the party they voted for (CSES 5).

# Containers filled per party. voter.5 / party.5 / Vcdf.5 / Pcdf.5 are indexed
# by the numeric PCSESID itself; rep.gap.5 / pol.gap.5 are keyed by its name.
voter.5 <- list()
Vcdf.5 <- list()
party.5 <- list()
Pcdf.5 <- list()
rep.gap.5 <- list()
pol.gap.5 <- list()

# CSES 5 self-placement: values above 10 are treated as missing.
cses.5.noNA$E3020[cses.5.noNA$E3020 > 10] <- NA
cses.5.noNA <- drop_na(cses.5.noNA, E3020)

# Same rule for the perceived party placement.
cses.5.noNA$voted.for.score[cses.5.noNA$voted.for.score > 10] <- NA
cses.5.noNA <- drop_na(cses.5.noNA, voted.for.score)

for (i in unique(ppdb.noNA$PCSESID)) {
  party_voters <- which(cses.5.noNA$voted.for == i)

  # CDF of the voters' self-placement on the 0-10 scale
  v5 <- as.matrix(na.omit(cses.5.noNA$E3020[party_voters]))
  voter.5[[i]] <- v5
  Vcdf.5[[i]] <- empirical_cdf(v5, ubounds = seq(0, 10, by = 1.0))

  # CDF of the voters' perceived party placement on the 0-10 scale
  p5 <- as.matrix(na.omit(cses.5.noNA$voted.for.score[party_voters]))
  party.5[[i]] <- p5
  Pcdf.5[[i]] <- empirical_cdf(p5, ubounds = seq(0, 10, by = 1.0))

  # Gap between the two CDFs (third column of each empirical_cdf() result):
  # summed absolute difference and summed signed difference.
  party_cdf <- Pcdf.5[[i]][[3]]
  voter_cdf <- Vcdf.5[[i]][[3]]
  rep.gap.5[[as.character(i)]] <- sum(abs(party_cdf - voter_cdf))
  pol.gap.5[[as.character(i)]] <- sum(party_cdf - voter_cdf)
}

## Adding the CSES 5 representation gap and polarization gap to the PPDB data
# Each list of per-party scores is flattened into a (PCSESID, score) table
# and merged onto the party data by PCSESID.

rep_scores_5 <- unlist(rep.gap.5)
Rep.Gap.5 <- data.frame(
  PCSESID = names(rep_scores_5),
  party.gap.5 = rep_scores_5,
  stringsAsFactors = FALSE
)
ppdb.noNA <- merge(ppdb.noNA, Rep.Gap.5, by = "PCSESID")

pol_scores_5 <- unlist(pol.gap.5)
Pol.Gap.5 <- data.frame(
  PCSESID = names(pol_scores_5),
  party.pol.5 = pol_scores_5,
  stringsAsFactors = FALSE
)
ppdb.noNA <- merge(ppdb.noNA, Pol.Gap.5, by = "PCSESID")

##### adding the Netherlands' parties from cses.3 #####

# The IMD is split on IMD1008_MOD_3 and the second group is kept.
# NOTE(review): presumably the second group holds the rows flagged as
# belonging to module 3 -- confirm against the codebook.
cses.imd.3 <- split.data.frame(cses.imd, cses.imd$IMD1008_MOD_3)
cses.3 <- cses.imd.3[[2]]

## Creating a new variable holding, for every voter, the code of the party
## they voted for. The first vote variable with a code below 9000000 is used,
## in the priority order PR_1, LH_PL, LH_DC; if none qualifies the value is NA.
## Vectorised: an NA in a vote variable falls through to the next one
## instead of aborting the script.
cses.3$voted.for <- dplyr::case_when(
  cses.3$IMD3002_PR_1 < 9000000 ~ cses.3$IMD3002_PR_1,
  cses.3$IMD3002_LH_PL < 9000000 ~ cses.3$IMD3002_LH_PL,
  cses.3$IMD3002_LH_DC < 9000000 ~ cses.3$IMD3002_LH_DC
)

summary(cses.3$voted.for)

# Dropping respondents without a usable vote choice
cses.3.noNA <- cses.3 %>% drop_na(voted.for)

# Perceived left-right placement of the party each respondent voted for.
cses.3.noNA$voted.for.score <- NA

# For every respondent, find the party slot (A to I) whose code in IMD5000_<slot>
# equals the party voted for and copy that slot's placement, IMD3007_<slot>.
# All nine slots are covered, including F, so voters of the party in slot F
# receive a score like everyone else.
# Slots are processed from I down to A so that, when several slots match,
# the earliest letter wins. `which()` skips NA comparisons, so a missing
# party code cannot abort the run.
for (party_slot in rev(LETTERS[1:9])) {
  slot_code <- cses.3.noNA[[paste0("IMD5000_", party_slot)]]
  slot_placement <- cses.3.noNA[[paste0("IMD3007_", party_slot)]]
  voted_for_slot <- which(slot_code == cses.3.noNA$voted.for)
  cses.3.noNA$voted.for.score[voted_for_slot] <- slot_placement[voted_for_slot]
}

# test
select(cses.3.noNA, voted.for, IMD5000_B, IMD3007_B, voted.for.score)

summary(cses.3.noNA$voted.for.score)
describe(cses.3.noNA$voted.for.score)

# Measure the gap between voters' self-placement and their perceived
# placement of the party they voted for (CSES module 3).

# Containers filled per party. voter.3 / party.3 / Vcdf.3 / Pcdf.3 are indexed
# by the numeric PCSESID itself; rep.gap.3 / pol.gap.3 are keyed by its name.
voter.3 <- list()
Vcdf.3 <- list()
party.3 <- list()
Pcdf.3 <- list()
rep.gap.3 <- list()
pol.gap.3 <- list()

# Self-placements above 10 are treated as missing.
cses.3.noNA$IMD3006[cses.3.noNA$IMD3006 > 10] <- NA
cses.3.noNA <- drop_na(cses.3.noNA, IMD3006)

# Same rule for the perceived party placement.
cses.3.noNA$voted.for.score[cses.3.noNA$voted.for.score > 10] <- NA
cses.3.noNA <- drop_na(cses.3.noNA, voted.for.score)

for (i in unique(ppdb.noNA$PCSESID)) {
  party_voters <- which(cses.3.noNA$voted.for == i)

  # CDF of the voters' self-placement on the 0-10 scale
  v3 <- as.matrix(na.omit(cses.3.noNA$IMD3006[party_voters]))
  voter.3[[i]] <- v3
  Vcdf.3[[i]] <- empirical_cdf(v3, ubounds = seq(0, 10, by = 1.0))

  # CDF of the voters' perceived party placement on the 0-10 scale
  p3 <- as.matrix(na.omit(cses.3.noNA$voted.for.score[party_voters]))
  party.3[[i]] <- p3
  Pcdf.3[[i]] <- empirical_cdf(p3, ubounds = seq(0, 10, by = 1.0))

  # Gap between the two CDFs (third column of each empirical_cdf() result):
  # summed absolute difference and summed signed difference.
  party_cdf <- Pcdf.3[[i]][[3]]
  voter_cdf <- Vcdf.3[[i]][[3]]
  rep.gap.3[[as.character(i)]] <- sum(abs(party_cdf - voter_cdf))
  pol.gap.3[[as.character(i)]] <- sum(party_cdf - voter_cdf)
}

# Adding the module 3 representation gap and polarization gap to the PPDB data.
# Each list of per-party scores is flattened into a (PCSESID, score) table
# and merged onto the party data by PCSESID.
rep_scores_3 <- unlist(rep.gap.3)
Rep.Gap.3 <- data.frame(
  PCSESID = names(rep_scores_3),
  party.gap.3 = rep_scores_3,
  stringsAsFactors = FALSE
)

ppdb.noNA <- merge(ppdb.noNA, Rep.Gap.3, by = "PCSESID")

pol_scores_3 <- unlist(pol.gap.3)
Pol.Gap.3 <- data.frame(
  PCSESID = names(pol_scores_3),
  party.pol.3 = pol_scores_3,
  stringsAsFactors = FALSE
)

ppdb.noNA <- merge(ppdb.noNA, Pol.Gap.3, by = "PCSESID")

##### adding the non-module-4 data to the PPDB data frame #####

# Per party: number of voters with a self-placement (count.V), number with a
# perceived party placement (count.P), and the means of both (mean.V, mean.P).
ppdb.noNA$count.V <- NA
ppdb.noNA$count.P <- NA
ppdb.noNA$mean.V <- NA
ppdb.noNA$mean.P <- NA

# Countries whose measures come from CSES 5 rather than IMD module 4.
cses5_countries <- c("Chile", "Hungary", "Italy")

for (i in seq_len(nrow(ppdb.noNA))) {
  party_id <- ppdb.noNA$PCSESID[i]
  party_country <- ppdb.noNA$COUNTRY[i]

  if (party_country == "Netherlands") {
    # Values for the Netherlands come from module 3. The module 3 gaps were
    # already merged onto this row by PCSESID (party.gap.3 / party.pol.3), so
    # they are read from the row itself; indexing the Rep.Gap.3 / Pol.Gap.3
    # tables by the row number of ppdb.noNA is only correct when both happen
    # to share the same row order.
    ppdb.noNA$party.gap[i] <- ppdb.noNA$party.gap.3[i]
    ppdb.noNA$party.pol[i] <- ppdb.noNA$party.pol.3[i]
    self_placements <- voter.3[[party_id]]
    party_placements <- party.3[[party_id]]

  } else if (party_country %in% cses5_countries) {
    # Values for Chile, Hungary and Italy come from CSES 5, again read from
    # the columns merged by PCSESID.
    ppdb.noNA$party.gap[i] <- ppdb.noNA$party.gap.5[i]
    ppdb.noNA$party.pol[i] <- ppdb.noNA$party.pol.5[i]
    self_placements <- voter.5[[party_id]]
    party_placements <- party.5[[party_id]]

  } else {
    # Every other country keeps its module 4 gaps; only counts and means
    # are filled in.
    self_placements <- voter.4[[party_id]]
    party_placements <- party.4[[party_id]]
  }

  ppdb.noNA$count.V[i] <- nrow(self_placements)
  ppdb.noNA$count.P[i] <- nrow(party_placements)
  ppdb.noNA$mean.V[i] <- mean(self_placements)
  ppdb.noNA$mean.P[i] <- mean(party_placements)
}

View(select(ppdb.noNA, COUNTRY, PCSESID, PTYNAME, party.gap, party.pol, activists, count.V, count.P))

# Parties with fewer than 14 perceived-placement observations get no
# representation gap.
describe(ppdb.noNA$party.gap[which(ppdb.noNA$count.P >= 14)])
ppdb.noNA$party.gap[which(ppdb.noNA$count.P < 14)] <- NA

# How many parties lack each of the two key variables
ppdb.noNA %>% summarise(count = sum(is.na(party.gap)))
ppdb.noNA %>% summarise(count = sum(is.na(activists)))

summary(ppdb.noNA$activists)
describe(ppdb.noNA$activists)
freq(ppdb.noNA$activists)

# Final party-level data: only parties with both a gap and an activists score
ppdb.fin <- ppdb.noNA %>% drop_na(party.gap, activists)
view(select(ppdb.fin, COUNTRY, PTYNAME, PCSESID, party.gap, activists, count.V))

summary(ppdb.fin$activists)
describe(ppdb.fin$activists)

summary(ppdb.fin$party.gap)
describe(ppdb.fin$party.gap)

scatter.smooth(ppdb.fin$activists, ppdb.fin$party.gap)
# No `party.pol.abs` column is ever created, so the absolute polarization gap
# is computed here; passing the missing column would hand NULL to
# scatter.smooth(), which then plots activists against the row index.
# NOTE(review): assumes `party.pol.abs` was meant to be abs(party.pol) -- confirm.
scatter.smooth(ppdb.fin$activists, abs(ppdb.fin$party.pol))

# saving as csv
write.csv(ppdb.fin, "C:\\Users\\yaira\\Desktop\\ideological polarization\\FINAL\\Do and Data\\ppdb.fin.csv")

###################################################################################################################
#################################### (3) DV- party and voter extremity and polarization ###########################
###################################################################################################################

####################################
#### Calculating party extremity ###
####################################
# party extremity = ((party position - country mean position)/5)^2 #

# 1) Mean party position for each country: the unweighted mean of mean.P over
#    the country's parties, written onto every row of that country.
country_party_mean <- tapply(ppdb.fin$mean.P, ppdb.fin$COUNTRY, mean)
ppdb.fin$mean.party.position <- NA
ppdb.fin$mean.party.position <-
  as.vector(country_party_mean[as.character(ppdb.fin$COUNTRY)])


## 2) Party extremity (Dalton formula): squared distance of the party from
##    its country mean, scaled by 5.
party_distance <- ppdb.fin$mean.P - ppdb.fin$mean.party.position
ppdb.fin$party.extremity <- (party_distance / 5)^2

summary(ppdb.fin$party.extremity)
hist(ppdb.fin$party.extremity)


#####################################
#### Calculating voters extremity ###
#####################################

# voters extremity = [((mean position of party i's voters) - (mean voter position in country j))/5]^2  #

# 1) Mean voter position per country: the unweighted mean of mean.V over the
#    country's parties, written onto every row of that country.
country_voter_mean <- tapply(ppdb.fin$mean.V, ppdb.fin$COUNTRY, mean)
ppdb.fin$mean.voter.position <- NA
ppdb.fin$mean.voter.position <-
  as.vector(country_voter_mean[as.character(ppdb.fin$COUNTRY)])


## 2) Voters' extremity (Dalton formula): squared distance of the party's
##    voters from the country mean, scaled by 5.
voter_distance <- ppdb.fin$mean.voter.position - ppdb.fin$mean.V
ppdb.fin$voter.extremity <- (voter_distance / 5)^2
summary(ppdb.fin$voter.extremity)
hist(ppdb.fin$voter.extremity)


##################################
### country level polarization ###
##################################

# DALTON: polarization = sqrt(sum(vote share * ((party position - mean party position)/5)^2))

ppdb.fin$party.polarization <- NA
# One pass per country; the country's value is written onto all of its rows.
for (country_name in unique(ppdb.fin$COUNTRY)) {
  country_rows <- which(ppdb.fin$COUNTRY == country_name)
  scaled_distance <-
    (ppdb.fin$mean.P[country_rows] -                 # party position
       ppdb.fin$mean.party.position[country_rows]) / 5 # country mean
  ppdb.fin$party.polarization[country_rows] <-
    sqrt(sum(ppdb.fin$vote.share[country_rows] * scaled_distance^2))
}
summary(ppdb.fin$party.polarization)
hist(ppdb.fin$party.polarization)


###########################
### voters polarization ###
###########################

# DALTON: polarization = sqrt(sum(vote share * ((voter position - mean voter position)/5)^2))

ppdb.fin$voter.polarization <- NA
# One pass per country; the country's value is written onto all of its rows.
for (country_name in unique(ppdb.fin$COUNTRY)) {
  country_rows <- which(ppdb.fin$COUNTRY == country_name)
  scaled_distance <-
    (ppdb.fin$mean.V[country_rows] -                 # voter position
       ppdb.fin$mean.voter.position[country_rows]) / 5 # country mean
  ppdb.fin$voter.polarization[country_rows] <-
    sqrt(sum(ppdb.fin$vote.share[country_rows] * scaled_distance^2))
}
summary(ppdb.fin$voter.polarization)
hist(ppdb.fin$voter.polarization)

# Adding the country-year variable.
# The identifier is copied from the CSES data set the country's measures were
# taken from. Countries are grouped by where and how the identifier is read:
#   - CSES 5:             first E1004 value of the country
#   - module 4 (first):   first IMD1004 value of the country
#   - module 4 (second):  second distinct IMD1004 value of the country
#   - module 3:           second distinct IMD1004 value (Netherlands)
# Countries in none of the groups keep NA.
from_cses5 <- c("Chile", "Hungary", "Italy")
from_cses4_first <- c("Australia", "Austria", "Brazil", "France", "Germany",
                      "Ireland", "Israel", "Norway", "Poland", "Portugal",
                      "Romania", "Sweden", "United Kingdom")
from_cses4_second <- c("Canada", "Mexico")

ppdb.fin$COUNTRY.YEAR <- NA
for (country_name in unique(ppdb.fin$COUNTRY)) {
  if (country_name %in% from_cses5) {
    study_id <- cses.5$E1004[which(cses.5$E1006_NAM == country_name)][1]
  } else if (country_name %in% from_cses4_first) {
    # The United Kingdom is listed as "Great Britain" in the CSES data.
    cses_name <- if (country_name == "United Kingdom") {
      "Great Britain"
    } else {
      country_name
    }
    study_id <- cses.4$IMD1004[which(cses.4$IMD1006_NAM == cses_name)][1]
  } else if (country_name %in% from_cses4_second) {
    study_id <-
      unique(cses.4$IMD1004[which(cses.4$IMD1006_NAM == country_name)])[2]
  } else if (country_name == "Netherlands") {
    study_id <-
      unique(cses.3$IMD1004[which(cses.3$IMD1006_NAM == "Netherlands")])[2]
  } else {
    next
  }
  ppdb.fin$COUNTRY.YEAR[which(ppdb.fin$COUNTRY == country_name)] <- study_id
}

view(select(ppdb.fin, COUNTRY, COUNTRY.YEAR))

# Activists at the country level: the sum of each activists measure over the
# country's parties, repeated on every row of the country.
# ppdb.fin stays grouped by COUNTRY afterwards.
ppdb.fin <- dplyr::mutate(
  dplyr::group_by(ppdb.fin, COUNTRY),
  coun.act = sum(activists),
  coun.act2 = sum(activists2)
)

# Country-level table: the country-constant columns, reduced to distinct rows.
# NOTE(review): ppdb.fin.coun is built but not written to disk here -- confirm.
ppdb.fin.coun <- unique(
  dplyr::select(
    ppdb.fin,
    COUNTRY, COUNTRY.YEAR, party.polarization, voter.polarization,
    coun.act, coun.act2, medleg, ENP, GALLSQ, elffrn
  )
)

# saving as csv (overwrites the ppdb.fin.csv written earlier in the script)
write.csv(ppdb.fin, "C:\\Users\\yaira\\Desktop\\ideological polarization\\FINAL\\Do and Data\\ppdb.fin.csv")

